Let’s explore the data using PCA
library(tidyverse)
library(tictoc)
library(caret)
library(ggfortify)
library(MASS)
library(cluster)
source("./parameters.R")
# Load the bag-of-words table -------------------------------------------------
# `col_types_df` is not defined in this chunk; presumably it is provided by
# parameters.R (sourced above) -- confirm.
fileName <- "bow_tfidf__min_words_100_2grams_1000__sampling_balanced__cor_cut_0.3_from_1408_to_1110_rm0.csv"

# Read the CSV and immediately discard columns 2, 3 and 5 through 9.
df <- read_csv(fileName, col_types = col_types_df)[, -c(2:3, 5:9)]

# Optional down-sampling for quicker experiments ------------------------------
# Set the switch below to FALSE to keep the full data set.
sampled <- TRUE
if (isTRUE(sampled)) {
  # Fixed seed so the same rows are drawn on every run.
  set.seed(42)
  # NOTE: from here on `max` holds the row count and `sampled` is rebound from
  # the logical switch to the number of rows kept (one tenth, rounded).
  max <- nrow(df)
  sampled <- round(max / 10)
  # Draw `sampled` distinct row indices out of 1..max and keep those rows.
  df <- df[sample(max, sampled), ]
}

# Display the resulting data set.
df